Read in the TS objects.We set the index as the dates and remove the time/date column and the X column. The temperature file has 11 zones and I assume that those 11 zones are the same as the first zones in the load TS.
library(grid)
library(gridExtra)
library(rmarkdown)
TS <- read.csv("/Users/arijoh/Documents/Skóli/École Polytechnique/Data Analysis and Unsupervised Learning /Project/MAP573/Data/Complete_TS.csv")
TST <- read.csv("/Users/arijoh/Documents/Skóli/École Polytechnique/Data Analysis and Unsupervised Learning /Project/MAP573/Data/TST.csv")
row.names(TST) <- TST$datetime
TST = subset(TST, select = -c(X, datetime))
row.names(TS) <- TS$datetime
TS = subset(TS, select = -c(X, datetime, V10)) #Remove Z10 which is V11 because it behaves strangelyBecause we have so many zones, we might want to simplify our data and only use the sum over all the zones. If we do not do that, the temperature vs weight plot will have different clusters and the TS plot will also have more lines. A simple mean should be ok due to them having the same units.
## [1] 39576
TS_sum <- rowSums(TS[(1:dim(TS)[1]), ])
TST_mean <- rowMeans(TST[(1:dim(TS)[1]), ])
data <- as.data.frame(cbind(TST_mean, TS_sum))The figure shows some relations shipt between temperature and load but as temperature increases/decreases from the mean (around 56 °F), the energy load increases.
plot(TST_mean, TS_sum,
main = "Relationship between energy load and temperature",
xlab = "Temperature [F]",
ylab = "Load [xW]",
cex = 0.1,
panel.first = grid(nx = NULL, ny = NULL, col = "red", lty = "dotted"))We make time series obect with zoo and ts. The zoo object have the time but the ts object only has the order. Load_zoo_zones includes all the zones but the load_zoo includes the sum of all of the loads at a time unit.
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
library(ggplot2)
times <- as.POSIXct(row.names(data), format = "%Y-%m-%d %H:%M:")
timesNAomit <- na.omit(times) #for NA values we omit
load_zoo_zones <- zoo( # all zones
x = TS,
order.by = timesNAomit,
frequency = 24
)
load_zoo <- zoo( #sum of the zones
x = data$TS_sum,
order.by = timesNAomit,
frequency = 24
)
temperature_zoo <- zoo(
x = data$TST_mean,
order.by = timesNAomit,
frequency = 24
)
data_zoo <- cbind(load_zoo, temperature_zoo)
load_ts <- ts(TS_sum, start = as.numeric(times[1]), frequency = 24) Here we will test 2004 and see how each zone behaves throughout the year. Because the first figure shows that there is a lot of difference in the magintude of load between zones we standardize it and plot it again. The standardized plot is much nicer. There are some white lines in the plt (zone 4) which are outliers in the zone (values close to 0). What to do about this? similar thing was in zone 9 and was removed.
years <- c("2004", "2005", "2006", "2007", "2008")
year = years[1]
temp_start <- paste(c(year, "-01-", "01 01:00:00"), collapse = "")
temp_end <- paste(c(year, "-12-", "31 24:00:00"), collapse = "")
as.POSIXct(temp_start)## [1] "2004-01-01 01:00:00 CET"
ystart <- which(as.POSIXct(temp_start) == index(load_zoo_zones))
yend <- which(as.POSIXct(temp_end) == index(load_zoo_zones))-1 #-1 til þess að losna við firsta value á næsta ári
year_zoo_zones <- load_zoo_zones[ystart:yend]
TS_year <- as.matrix(coredata(year_zoo_zones[1:dim(year_zoo_zones)[1],1:dim(year_zoo_zones)[2]]))
par(mfrow=c(1,2))
image(x = 1:dim(year_zoo_zones)[1],
y = 1:dim(year_zoo_zones)[2],
z = TS_year,
xlab = "The first year [hours]", ylab = "Zones", main = "'Heatmap' of zones for the first year")
ts_st <- matrix(data=NA,nrow=20,ncol=1)
for (i in 1:dim(TS_year)[2]){
temp <- sd(TS_year[,i])
ts_st[i] <- temp
}
for (i in 1:dim(TS_year)[2]){ #skipta um zone
for (j in 1:dim(TS_year)[1]){ #dagar í einu zoni
TS_year[j,i] <- TS_year[j,i]/ts_st[i]
}
}
image(x = 1:dim(year_zoo_zones)[1],
y = 1:dim(year_zoo_zones)[2],
z = TS_year,
ylab = "zones",
xlab = "Hours",
main = "Standardized 'Heatmap' of zones for the first year")To compare years, we extract each year out of the data with each measurement sum of all load. We then plot imageplot and lineplot to compare the years. Because during the day, the measurements change more than throughout the months, we try to use smoothening by taking average load over each 24 hours. We then do the same for weeks and months.
I am here, start by making plots beautiful
#sum everything and compare years
smoothpars <- c(0,24,24*7, 24*30) #0 fyrir klukkustund, 24 fyrir dag, 24*7 fyrir vikur
size_year = c(0,0,0,0,0)
for (i in (1:(length(years)))){
year = years[i]
temp_start <- paste(c(year, "-01-", "01 01:00:00"), collapse = "")
temp_end <- paste(c(year, "-12-", "31 24:00:00"), collapse = "")
if (i == 5){yend = length(load_zoo)} else {yend <- which(as.POSIXct(temp_end) == index(load_zoo))-1} #-1 til þess að losna við firsta value á næsta ári
ystart <- which(as.POSIXct(temp_start) == index(load_zoo))
size_year[i] <- length(load_zoo[ystart:yend])
}
counter <- 1
labels <- c("Hours", "Days", "Weeks", "Months")
titles <- c("At each hour", "Average load over the day", "Average load over the week", "Average load over the months")
for (s in smoothpars){
year_zoo <- matrix(data = NA, nrow = 5, ncol = max(size_year))
for (i in (1:(length(years)))){
year = years[i]
temp_start <- paste(c(year, "-01-", "01 01:00:00"), collapse = "")
temp_end <- paste(c(year, "-12-", "31 24:00:00"), collapse = "")
as.POSIXct(temp_start)
if (i == 5){yend = length(load_zoo)} else {yend <- (which(as.POSIXct(temp_end) == index(load_zoo))-1)} #-1 til þess að losna við firsta value á næsta ári
ystart <- which(as.POSIXct(temp_start) == index(load_zoo))
temp <- coredata(load_zoo[ystart:yend])
year_zoo[i,1:length(temp)] <- temp
}
year_zoo <- as.matrix(coredata(year_zoo))
if (s > 0){
##Average accross 24 horys
year_zoo_smooth <- matrix(data = NA, nrow = 5, ncol = ceiling(max(size_year)/s))
idx1 <- 1
idx2 <- 1
while (idx2 <= dim(year_zoo)[2]-(s-1)){
temp <- year_zoo[,(idx2:(idx2+(s-1)))]
temp <- rowMeans(temp)
year_zoo_smooth[,idx1] <- coredata(temp)
idx2 <- idx2+s
idx1 <- idx1+1
}
year_zoo <- year_zoo_smooth
}
p1 <- image(x = 1:dim(year_zoo)[1],
y = 1:dim(year_zoo)[2],
z = year_zoo,
xlab = "The first year [hours]", ylab = "Zones", main = "'Heatmap' of zones for the first year")
year_zoo <- t(year_zoo)
colnames(year_zoo) <- c("year1", "year2", "year3", "year4", "year5")
year_zoo <- as.data.frame(year_zoo)
p2 <- ggplot(data=year_zoo, aes(x=index(year_zoo))) +
geom_line(aes(y=year1, color = "2004")) +
geom_line(aes(y=year2, color = "2005")) +
geom_line(aes(y=year3, color = "2006")) +
geom_line(aes(y=year4, color = "2007")) +
geom_line(aes(y=year5, color = "2008")) +
scale_colour_manual("",
breaks = c("2004", "2005", "2006", "2007", "2008"),
values = c("2004"="yellow", "2005"="red", "2006"="green", "2007"="blue", "2008"= "black"))+
xlab(labels[counter]) +
ylab('Load')+
labs(title = "Load over the time", subtitle = titles[counter])+
scale_fill_discrete(name="Year")
p1
print(p2)
counter <- counter + 1
}## Warning: Removed 24 rows containing missing values (geom_path).
## Warning: Removed 24 rows containing missing values (geom_path).
## Warning: Removed 24 rows containing missing values (geom_path).
## Warning: Removed 4415 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 185 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 28 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 7 rows containing missing values (geom_path).
load.means.diff <- diff(TS_sum)
d <- coredata(load.means.diff)
d <- as.data.frame(cbind(1:length(d), d))
p1 <- ggplot(d,aes(V1, d))+geom_line()+
labs(title = "Difference in load between time steps", xlav="Time", ylab="Difference")
d <- as.matrix(load.means.diff)
bw <- 2 * IQR(d) / length(d)^(1/3) #Freedman-Diaconis rule
d <- as.data.frame(d)
p2 <- ggplot(d, aes(x = V1)) +
labs(title= "Difference between each time step ",
y="Count",
x = "Difference")+
geom_histogram(binwidth = bw)
d <- coredata(TS_sum)
bw <- 2 * IQR(d) / length(d)^(1/3) #Freedman-Diaconis rule
p3 <- ggplot(, mapping = aes(x = d)) +
labs(title= "Histogram of load",
y="Count",
x = "Load")+
geom_histogram(binwidth = bw)
grid.arrange(p1, p2, p3, nrow = 3)We use the zoo library because the ts library does not support hourly collected data. First, the dates/hours are formatted and then a TS object is made out of them. This is done for both load and temperature. Then the data is plotted. The y axis og the ggplot has to be changed so that the left axis shows W and the right shows temeprature f.x.
p1 <- ggplot(data_zoo, aes(timesNAomit)) +
geom_line(aes(y = load_zoo)) +
labs(title= "Correlation of load and temperature (no units)",
y="Load", x = "Year")
p2 <- ggplot(data_zoo, aes(timesNAomit))+
geom_line(aes(y = temperature_zoo))+
labs(y="Temperature", x = "Year")
grid.arrange(p1, p2, nrow = 2)## Warning: Removed 13 rows containing missing values (geom_path).
Here, we investigate how values in the time series are correlated to one another. We do this for different amouts of lags to wether the data is seasonal. In following figure we can see that at lag 1, the autocorrelation is highest (not far from 1). When lags are increased the correlation between the variables decreases untill around 22-24, where the datas correlation increases a little. In the ACF plot this can be more clearly seen.
library(forecast)
p1 <- ggAcf(TS_sum)
p2 <- ggAcf(TS_sum, lag.max = 24*35)
grid.arrange(p1, p2, nrow = 2)The autocorrelation decreases from lag 1 but then around 15, it starts increasing, reacking a peak at 24. This behaviour repeats itself every 24 hours, decreasing a little every time. However, there are some intervals where it increases sligly (see bottom figure). Where the ACF increases slightly is the 6th and the 6th and the 7th lag. From this we have some information that there is a autocorrelation for lags 24, 48,72,… and there is also some autocorrelation increase for lags \[6*24\] and \[7*24\], meaning that the same weekdays have are more correlated than different ones.
To get the direct effect, we performed partial autocorrelation. Here it is easier to see what is going on. The first figure shows how autocorrelation drops, barely correlation around lag 10 but then goes up and is positively correlated for lags 15-16. The correlation is however greatest at lag 24 (nagative correlation). This is consistent with what we saw earlier.
In the next figure, the lags are extended to 200. It is clear that the PACF is the greatest at lag 24. However, every 24 lags it decreases up untill lags \[6*24\] and \[7*24\] (also onsistent with what we saw earlier).
Extending lags more, weekly correlation decreases.
There seems to be a seasonal period of 24 hours. Also, when lags are increased there seem to be another seasonal period of 7 days. Now to partial autocorrelation.
p1 <- ggAcf(TS_sum, type = "partial")
p2 <- ggAcf(TS_sum, type = "partial", lag.max = 24*7)
p3 <- ggAcf(TS_sum, type = "partial", lag.max = 24*7*4)
grid.arrange(p1, p2, p3, ncol = 2, layout_matrix = rbind(c(1,2), c(3,3)))There is a significant partial autocorrelation on lag 1, 2, 3, 13:17, 24,25, and 26. When the lag is higher, there seems to be partial autocorrelation every 24 hours and every 7 days, confirming a seasonal period of 24 hours as well as a weekly seasonal period. HOW TO CHECK FOR MORE SEASONS, YEAR OR MONTHS??
To compare with, we plot the first day, week, and month to see how the timeseries behaves.
p1 <- autoplot(ts(load_ts[1:24]), main="24 hours from baseline")+xlab("Time")+ylab("Load")
p2 <- autoplot(load_zoo[1:24*7], main="week from baseline")+xlab("Time")+ylab("Load")
p3 <- autoplot(load_zoo[1:24*7*4], main="month from baseline")+xlab("Time")+ylab("Load")
grid.arrange(p1, p2, p3, ncol = 2, layout_matrix = rbind(c(1,2), c(3,3)))N_obs <- sum(size_year)
D <- matrix(data=NA,nrow=N_obs,ncol=12)
colnames(D) <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
years = c("2004-", "2005-", "2006-", "2007-", "2008-", "2009-") #2009 ekki í datasetti
months = c("01-", "02-", "03-", "04-", "05-", "06-", "07-", "08-", "09-", "10-", "11-", "12-", "01-")
MaxIndex <- 1
for (y in 1:5){
year <- years[y]
for (m in 1:12){
MaxIndex_month <- 1
month <- months[m]
nextmonth <- months[m+1]
if (m == 12)
{
nextyear <- years[y+1]
temp_start <- paste(c(year, month, "01"), collapse = "")
temp_end <- paste(c(nextyear, nextmonth, "01"), collapse = "")
}
else{
temp_start <- paste(c(year, month, "01"), collapse = "")
temp_end <- paste(c(year, nextmonth, "01"), collapse = "")
}
sDay <- temp_start
eDay <-temp_end
monthData <- coredata(window(load_zoo, start = as.POSIXct(sDay), end = as.POSIXct(eDay)))
lengthMonth <- length(monthData)
D[MaxIndex:(MaxIndex+lengthMonth-1),m] <- monthData[1:lengthMonth]
if (lengthMonth > MaxIndex_month) {MaxIndex_month <- lengthMonth}
}
MaxIndex <- MaxIndex_month + MaxIndex
}
#Now boxplot with D
boxplot(D,
main = "Average energy load over the zones vs months",
na.action = NULL,
xlab = "Months",
ylab = "Energy load",
col = "orange",
border = "black")Does not improve our results
Transformation <- function(TS, Lambda)
{
w <- vector()
for(i in 1:length(TS)){
if (Lambda == 0){
w[i] <- log(TS[i])
}
else{
w[i] <- (((TS[i])^Lambda)-1)/Lambda
}
}
return(w)
}
BackTransform <- function(TS, Lambda){
w <- vector()
for(i in 1:length(TS)){
if (Lambda == 0){
w[i] <- exp(TS[i])
}
else{
w[i] <- (Lambda*TS[i]+1)^(1/Lambda)
}
}
return(w)
}
Lambda <- c(-1, 0, 1, 2)*2
for (i in Lambda){
TS_sum_transformed <- Transformation(TS_sum, i)
plot.zoo(TS_sum_transformed)
}Lambda <- BoxCox.lambda(TS_sum)
TS_sum_transformed <- Transformation(TS_sum, Lambda)
plot.zoo(TS_sum_transformed)TS_sum_backtransformed <- BackTransform(TS_sum_transformed, Lambda)
plot.zoo(TS_sum_backtransformed)